# conv-s2s
#mkdir ../out/conv-s2s-d0.3
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train data-bin/writingPrompts -a fconv_self_att_wp \
#    --lr 0.25 --clip-norm 0.1 --max-tokens 1500 --lr-scheduler reduce_lr_on_plateau \
#    --decoder-attention True --encoder-attention False \
#    --criterion label_smoothed_cross_entropy --weight-decay .0000001 --label-smoothing 0 \
#    --source-lang wp_source --target-lang wp_target \
#    --gated-attention True --self-attention True --project-input True \
#    --pretrained False --tensorboard-logdir ../out/conv-s2s-d0.3 \
#    --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#    --save-dir ../out/conv-s2s-d0.3
# conv-s2s baseline: prompt -> story seq2seq (fconv self-attention WP arch)
# on the BPE-500 "fix" binarized data; checkpoints and TensorBoard logs share one dir.
run_dir=../out/conv-s2s-d0.3_bpe500
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --user-dir coherence_story --task translation_bpe \
    data-bin/writingPrompts-prompt2story_bpe500_fix \
    -a fconv_self_att_wp \
    --source-lang prompt --target-lang story \
    --criterion story_cross_entropy_without_event --label-smoothing 0 \
    --lr 0.25 --lr-scheduler reduce_lr_on_plateau --clip-norm 0.1 \
    --weight-decay .0000001 --dropout 0.3 --max-tokens 4000 \
    --encoder-attention False --decoder-attention True \
    --self-attention True --gated-attention True --project-input True \
    --pretrained False \
    --distributed-world-size 2 \
    --log-interval 1000 --log-format simple \
    --skip-invalid-size-inputs-valid-test \
    --tensorboard-logdir "$run_dir" --save-dir "$run_dir"

# conv-s2s fusion
#mkdir ../out/conv-s2s-d0.3-fusion
#CUDA_VISIBLE_DEVICES=0,1,2 fairseq-train data-bin/writingPrompts -a fconv_self_att_wp_fusion \
#    --lr 0.25 --clip-norm 0.1 --max-tokens 1500 --lr-scheduler reduce_lr_on_plateau \
#    --decoder-attention True --encoder-attention False \
#    --criterion label_smoothed_cross_entropy --weight-decay .0000001 --label-smoothing 0 \
#    --source-lang wp_source --target-lang wp_target \
#    --gated-attention True --self-attention True --project-input True \
#    --tensorboard-logdir ../out/conv-s2s-d0.3-fusion \
#    --log-interval  1000 --log-format simple --distributed-world-size 3 --dropout 0.3 \
#    --save-dir ../out/conv-s2s-d0.3-fusion --pretrained True --pretrained-checkpoint ../out/conv-s2s-d0.3/checkpoint_best.pt
# conv-s2s fusion model: same data/config as the base conv-s2s run, but the
# fconv_self_att_wp_fusion arch initialized from the base run's best checkpoint.
run_dir=../out/conv-s2s-d0.3-fusion_bpe500
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --user-dir coherence_story --task translation_bpe \
    data-bin/writingPrompts-prompt2story_bpe500_fix \
    -a fconv_self_att_wp_fusion \
    --source-lang prompt --target-lang story \
    --criterion story_cross_entropy_without_event --label-smoothing 0 \
    --lr 0.25 --lr-scheduler reduce_lr_on_plateau --clip-norm 0.1 \
    --weight-decay .0000001 --dropout 0.3 --max-tokens 4000 \
    --encoder-attention False --decoder-attention True \
    --self-attention True --gated-attention True --project-input True \
    --pretrained True --pretrained-checkpoint ../out/conv-s2s-d0.3_bpe500/checkpoint_best.pt \
    --distributed-world-size 2 \
    --log-interval 1000 --log-format simple \
    --skip-invalid-size-inputs-valid-test \
    --tensorboard-logdir "$run_dir" --save-dir "$run_dir"

# trans-lm
#mkdir ../out/transformer-lm
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task language_modeling data-bin/writingPrompts-lm \
#    --save-dir ../out/transformer-lm --arch transformer_lm_gpt \
#    --lr 0.25 --clip-norm 0.1 --max-tokens 1500 --lr-scheduler reduce_lr_on_plateau \
#    --criterion label_smoothed_cross_entropy --weight-decay .0000001 --label-smoothing 0 \
#    --tensorboard-logdir ../out/transformer-lm \
#    --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#    --tokens-per-sample 1500 --sample-break-mode eos

# conv-lm
#mkdir ../out/conv-lm
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task language_modeling data-bin/writingPrompts-lm \
#    --save-dir ../out/conv-lm --arch fconv_lm \
#    --lr 0.25 --clip-norm 0.1 --max-tokens 1500 --lr-scheduler reduce_lr_on_plateau \
#    --criterion label_smoothed_cross_entropy --weight-decay .0000001 --label-smoothing 0 \
#    --tensorboard-logdir ../out/conv-lm \
#    --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#    --tokens-per-sample 1500 --sample-break-mode eos

## event to story model
#mkdir ../out/bilstm_transformer-lm
#CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --task hierstory data-bin/writingPrompts-event2story \
#    --save-dir ../out/bilstm_transformer-lm --arch bilstm_h_transformer_lm_gpt \
#    --source-lang event --target-lang new --max-target-positions 1500 \
#    --lr 0.25 --clip-norm 0.1 --max-tokens 1500 --lr-scheduler reduce_lr_on_plateau \
#    --criterion story_cross_entropy --weight-decay .0000001 --label-smoothing 0 \
#    --tensorboard-logdir ../out/bilstm_transformer-lm \
#    --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3
# event to story model 500 bpe
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task hierstory_bpe data-bin/writingPrompts-event2story_bpe500 \
#    --save-dir ../out/bilstm_transformer-lm_bpe500 --arch bilstm_h_transformer_lm_gpt \
#    --source-lang event --target-lang new  --max-tokens 4000 \
#    --optimizer adam   --lr 0.001 \
#    --criterion story_cross_entropy --weight-decay .0001 --label-smoothing 0 \
#    --tensorboard-logdir ../out/bilstm_transformer-lm_bpe500 \
#    --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#    --skip-invalid-size-inputs-valid-test
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task hierstory_bpe data-bin/writingPrompts-event2story_bpe500 \
#    --save-dir ../out/bilstm_transformer-lm_bpe500 --arch bilstm_h_transformer_lm_gpt \
#    --source-lang event --target-lang new  --max-tokens 4000 \
#    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#    --criterion story_cross_entropy --weight-decay .0001 --label-smoothing 0 \
#    --tensorboard-logdir ../out/bilstm_transformer-lm_bpe500 \
#    --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#    --skip-invalid-size-inputs-valid-test
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task hierstory_bpe data-bin/writingPrompts-event2story_bpe500 \
#    --save-dir ../out/bilstm_transformer-lm_bpe500_wo_event --arch bilstm_h_transformer_lm_gpt \
#    --source-lang event --target-lang new  --max-tokens 4000 \
#    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#    --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
#    --tensorboard-logdir ../out/bilstm_transformer-lm_bpe500_wo_event \
#    --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#    --skip-invalid-size-inputs-valid-test
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task hierstory_bpe data-bin/writingPrompts-event2story_bpe500 \
#    --save-dir ../out/bilstm_transformer-lm_bpe500_format_pre --arch bilstm_h_transformer_lm_gpt_format \
#    --source-lang event --target-lang new  --max-tokens 4000 \
#    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#    --criterion story_cross_entropy --weight-decay .0001 --label-smoothing 0 \
#    --tensorboard-logdir ../out/bilstm_transformer-lm_bpe500_format_pre \
#    --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#    --skip-invalid-size-inputs-valid-test \
#    --pretrained  --pretrained-checkpoint data-bin/writingPrompts-event2story_bpe500/gpt2model.pytorch
# --- SRL events -> story, hierarchical generators (bilstm encoder + GPT-format transformer LM) ---
# NOTE(review): unlike the other active runs in this file, these fairseq-train
# calls pass no --user-dir, yet hierstory_bpe / bilstm*_h_transformer_lm_gpt_format /
# story_cross_entropy* are custom extensions -- confirm they resolve without
# --user-dir coherence_story in the fairseq checkout actually used.
# Flat (non-hier) model, joint event+story loss, GPT-2 pretrained init.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task hierstory_bpe data-bin/writingPrompts-srl2story_bpe500 \
    --save-dir ../out/bilstm_transformer-lm_bpe500_format_pre_srl --arch bilstm_h_transformer_lm_gpt_format \
    --source-lang event --target-lang new  --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/bilstm_transformer-lm_bpe500_format_pre_srl \
    --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test \
    --pretrained  --pretrained-checkpoint data-bin/writingPrompts-srl2story_bpe500/gpt2model.pytorch
# Hierarchical variant (bilstmhier arch), event-only loss, 4 GPUs with smaller
# per-GPU batch; warm-started from a previously trained format_pre2 LM and
# checkpointed every 5000 updates.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --task hierstory_bpe data-bin/writingPrompts-srl2story_bpe500 \
    --save-dir ../out/bilstm_transformer-lm_bpe500_format_pre_srl_hier --arch bilstmhier_h_transformer_lm_gpt_format \
    --source-lang event --target-lang new  --max-tokens 2000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy_only_event --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/bilstm_transformer-lm_bpe500_format_pre_srl_hier \
    --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test \
    --pretrained  --pretrained-checkpoint ../fire_data/running/transformer-lm_bpe500_warmlr_format_pre2/checkpoint_best.pt \
    --save-interval-updates 5000
# Same hierarchical setup saved under a "_pos" dir. NOTE(review): the flags are
# otherwise identical to the run above -- presumably the positional variant is
# selected in code; verify before rerunning.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --task hierstory_bpe data-bin/writingPrompts-srl2story_bpe500 \
    --save-dir ../out/bilstm_transformer-lm_bpe500_format_pre_srl_hier_pos --arch bilstmhier_h_transformer_lm_gpt_format \
    --source-lang event --target-lang new  --max-tokens 2000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy_only_event --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/bilstm_transformer-lm_bpe500_format_pre_srl_hier_pos \
    --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test \
    --pretrained  --pretrained-checkpoint ../fire_data/running/transformer-lm_bpe500_warmlr_format_pre2/checkpoint_best.pt
# hier3 variant, back to 2 GPUs. NOTE(review): the pretrained checkpoint is the
# generic ../out/checkpoint_best.pt -- unlike the runs above it does not name a
# run directory, so make sure the intended checkpoint is actually there.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task hierstory_bpe data-bin/writingPrompts-srl2story_bpe500 \
    --save-dir ../out/bilstm_transformer-lm_bpe500_format_pre_srl_hier3 --arch bilstmhier3_h_transformer_lm_gpt_format \
    --source-lang event --target-lang new  --max-tokens 2000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy_only_event --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/bilstm_transformer-lm_bpe500_format_pre_srl_hier3 \
    --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test \
    --pretrained  --pretrained-checkpoint ../out/checkpoint_best.pt
# hier_pos rerun on the corrected "_fix" binarized data.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --task hierstory_bpe data-bin/writingPrompts-srl2story_bpe500_fix \
    --save-dir ../out/bilstm_transformer-lm_bpe500_format_pre_srl_hier_pos_fix --arch bilstmhier_h_transformer_lm_gpt_format \
    --source-lang event --target-lang new  --max-tokens 2000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy_only_event --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/bilstm_transformer-lm_bpe500_format_pre_srl_hier_pos_fix \
    --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test \
    --pretrained  --pretrained-checkpoint ../fire_data/running/transformer-lm_bpe500_warmlr_format_pre2/checkpoint_best.pt
# hier4 variant on "_fix" data with the story-only (without_event) loss; same
# generic ../out/checkpoint_best.pt caveat as the hier3 run above.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task hierstory_bpe data-bin/writingPrompts-srl2story_bpe500_fix \
    --save-dir ../out/bilstm_transformer-lm_bpe500_format_pre_srl_hier4_fix --arch bilstmhier4_h_transformer_lm_gpt_format \
    --source-lang event --target-lang new  --max-tokens 2000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/bilstm_transformer-lm_bpe500_format_pre_srl_hier4_fix \
    --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test \
    --pretrained  --pretrained-checkpoint ../out/checkpoint_best.pt

# story language model
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
#  --save-dir ../out/transformer-lm_bpe500 --arch transformer_lm_gpt_bpe \
#  --optimizer adam   --lr 0.001  --max-tokens 4000 \
#  --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0 \
#  --tensorboard-logdir ../out/transformer-lm_bpe500  \
#  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
#  --save-dir ../out/transformer-lm_bpe500_warmlr --arch transformer_lm_gpt_bpe \
#  --max-tokens 4000 \
#  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#  --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0 \
#  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr  \
#  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
##  --save-dir ../out/transformer-lm_bpe500_warmlr_format --arch transformer_lm_gpt_bpe_format \
##  --max-tokens 4000 \
##  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
##  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
##  --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0 \
##  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format  \
##  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
##  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos
##CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
##  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre --arch transformer_lm_gpt_bpe_format \
##  --max-tokens 4000 \
##  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
##  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
##  --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0 \
##  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre  \
##  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
##  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
##  --pretrained  --pretrained-checkpoint data-bin/writingPrompts-event2story_bpe500/gpt2model.pytorch
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
#  --save-dir ../out/transformer-lm_bpe500_warmlr_format2 --arch transformer_lm_gpt_bpe_format \
#  --max-tokens 4000 \
#  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
#  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format2  \
#  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
#  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2 --arch transformer_lm_gpt_bpe_format \
#  --max-tokens 4000 \
#  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
#  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2  \
#  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
#  --pretrained  --pretrained-checkpoint data-bin/writingPrompts-event2story_bpe500/gpt2model.pytorch
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task language_modeling_bpe data-bin/writingPrompts-srl2story_bpe500 \
#  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_srl --arch transformer_lm_gpt_bpe_format \
#  --max-tokens 4000 \
#  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
#  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_srl  \
#  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
#  --pretrained  --pretrained-checkpoint data-bin/writingPrompts-srl2story_bpe500/gpt2model.pytorch
# Abstract prompt+story LM: GPT-2-initialized transformer LM on the promptstory
# BPE-500 data (.new-None.new suffix, truncated), story-only (without_event) loss.
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-promptstory_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_absstory --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .new-None.new --truncate \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_absstory  \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# Coreference-aware LM: coref task with attention-based entity loss (weight 0.3).
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe_coref data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/transformer-lm_bpe500_warmlr_format_pre2_coref --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .event-new.new --entity-weight 0.3 --use-attn \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/transformer-lm_bpe500_warmlr_format_pre2_coref  \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch
# "_onehead" run. NOTE(review): the command line is identical to the run above
# except for the save/log dirs -- no flag here selects a one-head variant, so
# presumably that was switched in code; confirm before rerunning.
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe_coref data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/transformer-lm_bpe500_warmlr_format_pre2_coref_onehead --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .event-new.new --entity-weight 0.3 --use-attn \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/transformer-lm_bpe500_warmlr_format_pre2_coref_onehead  \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# Coref LM with a lighter entity weight (0.1) and the "_fix" coref criterion.
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe_coref data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/transformer-lm_bpe500_warmlr_format_pre2_coref_01s --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .event-new.new --entity-weight 0.1 --use-attn \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref_fix --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/transformer-lm_bpe500_warmlr_format_pre2_coref_01s \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# test speed; original save location: shared_info/rickwwang_ckpt
# Speed-test rerun of the 01s config on GPUs 2,3; saves under ../log.
CUDA_VISIBLE_DEVICES=2,3 python train.py --user-dir coherence_story --task language_modeling_bpe_coref data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../log/transformer-lm_bpe500_warmlr_format_pre2_coref_01s_test --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .event-new.new --entity-weight 0.1 --use-attn \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref_fix --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../log/transformer-lm_bpe500_warmlr_format_pre2_coref_01s_test \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch
# Same config reading the data from the home-dir fairseq checkout, saving to
# the shared_info checkpoint area.
CUDA_VISIBLE_DEVICES=2,3 python train.py --user-dir coherence_story --task language_modeling_bpe_coref ~/fairseq/data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/transformer-lm_bpe500_warmlr_format_pre2_coref_01s_test --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .event-new.new --entity-weight 0.1 --use-attn \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref_fix --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/transformer-lm_bpe500_warmlr_format_pre2_coref_01s_test \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch
# Same config reading from the apdcephfs share. NOTE(review): this run writes
# to the SAME ../log/..._01s_test save dir as the first speed test above --
# running both sequentially will mix/overwrite their checkpoints.
CUDA_VISIBLE_DEVICES=2,3 python train.py --user-dir coherence_story --task language_modeling_bpe_coref /apdcephfs/share_916081/shared_info/rickwwang_ckpt/fairseq/data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../log/transformer-lm_bpe500_warmlr_format_pre2_coref_01s_test --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .event-new.new --entity-weight 0.1 --use-attn \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref_fix --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../log/transformer-lm_bpe500_warmlr_format_pre2_coref_01s_test \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# Discourse-relation LM (single discourse loss head), entity-weight 0.3.
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe_discourse data-bin/writingPrompts-promptdstory_discourse_bpe500  \
  --save-dir ../shared_info/rickwwang_ckpt/promptdstory_discourse --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptdstory-None.promptdstory --truncate --entity-weight 0.3  \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_discourse --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/promptdstory_discourse  \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch
# "_two" variant: same flags but the discourse_two criterion.
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe_discourse data-bin/writingPrompts-promptdstory_discourse_bpe500  \
  --save-dir ../shared_info/rickwwang_ckpt/promptdstory_discourse_two --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptdstory-None.promptdstory --truncate --entity-weight 0.3  \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_discourse_two --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/promptdstory_discourse_two  \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch
# discourse_two entity-weight sweep: 0.1 ...
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe_discourse data-bin/writingPrompts-promptdstory_discourse_bpe500  \
  --save-dir ../shared_info/rickwwang_ckpt/promptdstory_discourse_two_01 --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptdstory-None.promptdstory --truncate --entity-weight 0.1  \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_discourse_two --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/promptdstory_discourse_two_01  \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch
# ... and 0.5.
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe_discourse data-bin/writingPrompts-promptdstory_discourse_bpe500  \
  --save-dir ../shared_info/rickwwang_ckpt/promptdstory_discourse_two_05 --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptdstory-None.promptdstory --truncate --entity-weight 0.5  \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_discourse_two --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/promptdstory_discourse_two_05  \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# test speed
# Speed-test rerun of the discourse_two 0.1 config (separate "_test" save dir).
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe_discourse data-bin/writingPrompts-promptdstory_discourse_bpe500  \
  --save-dir ../shared_info/rickwwang_ckpt/promptdstory_discourse_two_01_test --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptdstory-None.promptdstory --truncate --entity-weight 0.1  \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_discourse_two --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/promptdstory_discourse_two_01_test  \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# discourse and coreference
# Joint coreference + discourse LM: coref-weight sweep {0.1, 0.3, 0.5} with
# dis-weight fixed at 0.1, 4 GPUs, GPT-2 pretrained init.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe_coref_dis data-bin/writingPrompts-promptdstory_coref_discourse_bpe500 \
  --save-dir /apdcephfs/share_916081/shared_info/rickwwang_ckpt/log/promptstory_coref_dis_1010 --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptdstory-None.promptdstory --truncate \
  --coref-weight 0.1 --use-attn --dis-weight 0.1 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref_dis_fix --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir /apdcephfs/share_916081/shared_info/rickwwang_ckpt/log/promptstory_coref_dis_1010  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# coref-weight 0.3, dis-weight 0.1.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe_coref_dis data-bin/writingPrompts-promptdstory_coref_discourse_bpe500 \
  --save-dir /apdcephfs/share_916081/shared_info/rickwwang_ckpt/log/promptstory_coref_dis_3010 --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptdstory-None.promptdstory --truncate \
  --coref-weight 0.3 --use-attn --dis-weight 0.1 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref_dis_fix --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir /apdcephfs/share_916081/shared_info/rickwwang_ckpt/log/promptstory_coref_dis_3010  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# coref-weight 0.5, dis-weight 0.1.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe_coref_dis data-bin/writingPrompts-promptdstory_coref_discourse_bpe500 \
  --save-dir /apdcephfs/share_916081/shared_info/rickwwang_ckpt/log/promptstory_coref_dis_5010 --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptdstory-None.promptdstory --truncate \
  --coref-weight 0.5 --use-attn --dis-weight 0.1 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref_dis_fix --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir /apdcephfs/share_916081/shared_info/rickwwang_ckpt/log/promptstory_coref_dis_5010  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch


# discourse lm
# GPT-format transformer LM on the promptdstory stream, initialised from the
# converted GPT-2 checkpoint (data-bin/gpt2model.pytorch).  Note the custom
# flag is spelled --data-sufix (sic) throughout this file.
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-promptdstory_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/promptdstory --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .promptdstory-None.promptdstory --truncate \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/promptdstory \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# prompt2allen lm  (comment typo fixed: was "promp2allen")
# LM on the promptevent stream.  NOTE(review): unlike most sibling runs this
# one has no --pretrained flag, i.e. it trains from scratch — confirm that is
# intentional.
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-prompt2allen_bpe500 \
  --save-dir ../out/prompt2allen --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .promptevent-None.promptevent --truncate \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/prompt2allen  \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos

# prompt2abs lm
# Baseline LM on the promptabs stream, trained from scratch (no --pretrained).
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-prompt2abs_bpe500 \
  --save-dir ../shared_info/04938358e70a_ckpt/prompt2abs --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .promptabs-None.promptabs --truncate \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/04938358e70a_ckpt/prompt2abs  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos
# Same data/criterion as the run above but with the memory-gate architecture
# variant; also trained from scratch.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-prompt2abs_bpe500 \
  --save-dir ../out/prompt2abs_memory_gate --arch transformer_lm_bpe_memory_gate_gpt_format \
  --max-tokens 4000 --data-sufix .promptabs-None.promptabs --truncate \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/prompt2abs_memory_gate  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos
# Reverse direction (absprompt), GPT-2 initialised.  NOTE(review):
# --mask-context True passes a string "True" — presumably the custom task
# parses it as a boolean; confirm against the task's argparse definition.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-absprompt_bpe500 \
  --save-dir ../out/absprompt --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .absprompt-None.absprompt --truncate --mask-context True \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/absprompt  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch
# prompt2abs again, identical flags to the scratch run above but initialised
# from GPT-2 ("_pre" suffix in the save dir).
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-prompt2abs_bpe500 \
  --save-dir ../out/prompt2abs_pre --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .promptabs-None.promptabs --truncate \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/prompt2abs_pre  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# prompt2keyword lm
# LM on the promptkeyword stream, trained from scratch (no --pretrained).
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-prompt2keyword_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/prompt2keyword --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .promptkeyword-None.promptkeyword --truncate \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/prompt2keyword  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos

# absstory lm
# LM on the absstory stream.  NOTE(review): unlike most runs in this file
# there is no --truncate here — confirm oversized samples are handled only by
# --skip-invalid-size-inputs-valid-test.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-absstory_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/absstory --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .absstory-None.absstory \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/absstory  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos

# promptabsstory lm
# Baseline LM on the full prompt+abstract+story stream, from scratch,
# without --truncate.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-promptabsstory_bpe500 \
  --save-dir ../out/promptabsstory --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .promptabsstory-None.promptabsstory \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/promptabsstory  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos
# Variant with --truncate and --mask-mid True, GPT-2 initialised.
# NOTE(review): per the save-dir name, --mask-mid presumably masks the middle
# (abstract) segment of the sample — confirm in the custom task code.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-promptabsstory_bpe500 \
  --save-dir ../out/promptabsstory_mask_abs --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .promptabsstory-None.promptabsstory --truncate --mask-mid True \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/promptabsstory_mask_abs  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch
# As above plus --mask-input True; run tag "015" in the save-dir name.
# NOTE(review): the 0.15 setting is not expressed in any flag here —
# presumably a default inside the task/criterion; confirm.
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-promptabsstory_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_015_abs --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .promptabsstory-None.promptabsstory --truncate --mask-mid True --mask-input True \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_015_abs  \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch
# Run tag "050".  NOTE(review): the CLI flags are identical to the "015" run
# above apart from GPU count/world size and the output dir — the 0.50 value
# appears nowhere in the flags, so it must be configured elsewhere; confirm.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-promptabsstory_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_050_abs --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .promptabsstory-None.promptabsstory --truncate --mask-mid True --mask-input True \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_050_abs  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# Coref + discourse multitask on promptabsdstory with mid-segment masking.
# NOTE(review): this run uses criterion
# story_cross_entropy_without_event_coref_dis (no "_fix" suffix) while the
# sibling runs below use the "_fix" variant — confirm this is intentional and
# not a stale pre-fix command.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe_coref_dis data-bin/writingPrompts-prompabsdstory_coref_discourse_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_coref_dis --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptabsdstory-None.promptabsdstory --truncate --mask-mid True \
  --coref-weight 0.3 --use-attn --dis-weight 0.3 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref_dis --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_coref_dis  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# Two copies of the same "_015s" coref+discourse run with --mask-input; the
# only difference between them is the output base path (../fire_data/running
# vs ../shared_info/rickwwang_ckpt) — presumably re-runs on different storage.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe_coref_dis data-bin/writingPrompts-prompabsdstory_coref_discourse_bpe500 \
  --save-dir ../fire_data/running/promptabsstory_mask_abs_coref_dis_015s --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptabsdstory-None.promptabsdstory --truncate --mask-mid True --mask-input True \
  --coref-weight 0.3 --use-attn --dis-weight 0.3 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref_dis_fix --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../fire_data/running/promptabsstory_mask_abs_coref_dis_015s  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch
# Same run, alternate output location.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe_coref_dis data-bin/writingPrompts-prompabsdstory_coref_discourse_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_coref_dis_015s --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptabsdstory-None.promptabsdstory --truncate --mask-mid True --mask-input True \
  --coref-weight 0.3 --use-attn --dis-weight 0.3 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref_dis_fix --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_coref_dis_015s  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# "_wopre" ablation: same masked-LM setup but without GPT-2 pretraining
# (no --pretrained), consistent with the save-dir name.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-promptabsstory_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_015s_abs_wopre --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .promptabsstory-None.promptabsstory --truncate --mask-mid True --mask-input True \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_015s_abs_wopre  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos

# Discourse-weight sweep at coref-weight 0.3: tags 3005 (--dis-weight 0.05)
# and 3010 (--dis-weight 0.1); otherwise identical runs.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe_coref_dis data-bin/writingPrompts-prompabsdstory_coref_discourse_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_coref_dis_015s_3005 --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptabsdstory-None.promptabsdstory --truncate --mask-mid True --mask-input True \
  --coref-weight 0.3 --use-attn --dis-weight 0.05 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref_dis_fix --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_coref_dis_015s_3005  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch
# dis-weight 0.1 variant (tag 3010).
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe_coref_dis data-bin/writingPrompts-prompabsdstory_coref_discourse_bpe500 \
  --save-dir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_coref_dis_015s_3010 --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptabsdstory-None.promptabsdstory --truncate --mask-mid True --mask-input True \
  --coref-weight 0.3 --use-attn --dis-weight 0.1 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref_dis_fix --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../shared_info/rickwwang_ckpt/promptabsstory_mask_abs_coref_dis_015s_3010  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# "_fine": same 3010 configuration but initialised from the earlier
# mask_abs_015 LM checkpoint instead of raw GPT-2.
# NOTE(review): the checkpoint path is under ../fire_data/running while the
# mask_abs_015 training command above saved to ../shared_info/rickwwang_ckpt —
# verify the checkpoint actually exists at this path.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe_coref_dis data-bin/writingPrompts-prompabsdstory_coref_discourse_bpe500 \
  --save-dir ../fire_data/running/promptabsstory_mask_abs_coref_dis_015s_3010_fine --arch transformer_lm_bpe_gpt_format_discourse \
  --max-tokens 4000 --data-sufix .promptabsdstory-None.promptabsdstory --truncate --mask-mid True --mask-input True \
  --coref-weight 0.3 --use-attn --dis-weight 0.1 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coref_dis_fix --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../fire_data/running/promptabsstory_mask_abs_coref_dis_015s_3010_fine  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint ../fire_data/running/promptabsstory_mask_abs_015_abs/checkpoint_best.pt

# promptkeywordstory lm
# Baseline LM on the prompt+keyword+story stream, from scratch, no --truncate.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-promptkeywordstory_bpe500 \
  --save-dir ../fire_data/running/promptkeywordstory --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .promptkeywordstory-None.promptkeywordstory \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../fire_data/running/promptkeywordstory  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos
# Masked variant (--mask-mid True, per the save-dir name masking the keyword
# segment), GPT-2 initialised.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-promptkeywordstory_bpe500 \
  --save-dir ../out/promptkeywordstory_mask_keyword --arch transformer_lm_gpt_bpe_format \
  --max-tokens 4000 --data-sufix .promptkeywordstory-None.promptkeywordstory --truncate --mask-mid True \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/promptkeywordstory_mask_keyword  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/gpt2model.pytorch

# allen2story trans
# Seq2seq (translation_bpe) run: promptevent -> new, GPT-BPE-format
# encoder-decoder; both dictionaries are BPE-type.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task translation_bpe data-bin/writingPrompts-allen2story_bpe500 \
    --save-dir ../task_out/allen2story --arch transformer_gpt_bpe_format \
    --source-lang promptevent --target-lang new --max-tokens 3000  \
    --src-dict-type bpe --tgt-dict-type bpe --truncate \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../task_out/allen2story \
    --log-interval 1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test
# Small-architecture variant ("_small_ls0": label smoothing 0) with
# --left-pad-source False.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task translation_bpe data-bin/writingPrompts-allen2story_bpe500 \
    --save-dir ../shared_info/rickwwang_ckpt/allen2story_small_ls0 --arch transformer_prompt_to_event_bpe \
    --source-lang promptevent --target-lang new --max-tokens 3000  \
    --src-dict-type bpe --tgt-dict-type bpe --truncate --left-pad-source False \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../shared_info/rickwwang_ckpt/allen2story_small_ls0 \
    --log-interval 1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test

# allen22story variant on the event-aware translation task
# (translation_bpe_event), 2 GPUs.
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task translation_bpe_event data-bin/writingPrompts-allen22story_bpe500 \
    --save-dir ../shared_info/rickwwang_ckpt/allen22story_small_ls0 --arch transformer_prompt_to_event_bpe \
    --source-lang promptevent --target-lang new --max-tokens 3000  \
    --src-dict-type bpe --tgt-dict-type bpe --truncate --left-pad-source False \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../shared_info/rickwwang_ckpt/allen22story_small_ls0 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test

# event -> prompt+story direction; note the word-level source dictionary
# (--src-dict-type word) and lower --max-tokens 2000.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task translation_bpe_event data-bin/writingPrompts-allen22promptstory_bpe500 \
    --save-dir ../out/allen22promptstory --arch transformer_gpt_bpe_format \
    --source-lang event --target-lang new --max-tokens 2000  \
    --src-dict-type word --tgt-dict-type bpe --truncate --left-pad-source False \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/allen22promptstory \
    --log-interval 1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test

# abs2story trans
# promptabs -> new seq2seq with the small architecture, label smoothing 0.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task translation_bpe data-bin/writingPrompts-abs2story_bpe500 \
    --save-dir ../shared_info/rickwwang_ckpt/abs2story_small_ls0 --arch transformer_prompt_to_event_bpe \
    --source-lang promptabs --target-lang new --max-tokens 4000  \
    --src-dict-type bpe --tgt-dict-type bpe --truncate --left-pad-source False \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../shared_info/rickwwang_ckpt/abs2story_small_ls0 \
    --log-interval 1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test

# abs -> prompt+story on the context-aware translation task
# (translation_bpe_context) with the GPT-BPE-format architecture.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train.py --user-dir coherence_story --task translation_bpe_context data-bin/writingPrompts-abs2promptstory_bpe500 \
    --save-dir ../shared_info/rickwwang_ckpt/abs2promptstory_small_ls0 --arch transformer_gpt_bpe_format \
    --source-lang abs --target-lang new --max-tokens 4000  \
    --src-dict-type bpe --tgt-dict-type bpe --truncate --left-pad-source False \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../shared_info/rickwwang_ckpt/abs2promptstory_small_ls0 \
    --log-interval 1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test

# Same abs2story setup as "_ls0" above but with --label-smoothing 0.1
# (hence no "_ls0" suffix) and 2 GPUs.
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task translation_bpe data-bin/writingPrompts-abs2story_bpe500 \
    --save-dir ../shared_info/rickwwang_ckpt/abs2story_small --arch transformer_prompt_to_event_bpe \
    --source-lang promptabs --target-lang new --max-tokens 4000  \
    --src-dict-type bpe --tgt-dict-type bpe --truncate --left-pad-source False \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0.1 \
    --tensorboard-logdir ../shared_info/rickwwang_ckpt/abs2story_small \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test

# test model by prompt2story_bpe500
# Sanity-check run of the seq2seq pipeline on the plain prompt -> story pair
# (same data as the conv-s2s run at the top of this file).
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task translation_bpe data-bin/writingPrompts-prompt2story_bpe500_fix \
    --save-dir ../out/prompt2story_bpe500 --arch transformer_prompt_to_event_bpe \
    --source-lang prompt --target-lang story --max-tokens 4000 \
    --src-dict-type bpe --tgt-dict-type bpe --truncate --left-pad-source False \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0.1 \
    --tensorboard-logdir ../out/prompt2story_bpe500 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test


# own method
# NOTE(review): these five fairseq-train commands originally omitted
# --user-dir coherence_story even though they reference the custom task
# (language_modeling_bpe), custom archs (transformer_lm_bpe_*_gpt_format) and
# custom criteria (story_cross_entropy_without_event*) registered in that
# plugin directory; every other command in this file passes the flag.  Added
# here for consistency — without it fairseq cannot resolve these names unless
# the extensions are installed as a package (confirm before reverting).
#
# Run 1: memory arch, warm-started from an earlier LM checkpoint.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory --arch transformer_lm_bpe_memory_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint ../fire_data/running/transformer-lm_bpe500_warmlr_format_pre2/checkpoint_best.pt
# Run 2: same memory arch, initialised from GPT-2 instead.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gpt --arch transformer_lm_bpe_memory_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gpt  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/writingPrompts-event2story_bpe500/gpt2model.pytorch
# Run 3: memory-gate arch.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gate_gpt --arch transformer_lm_bpe_memory_gate_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gate_gpt  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/writingPrompts-event2story_bpe500/gpt2model.pytorch
# Run 4: coherence arch + coherence criterion.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_gpt --arch transformer_lm_bpe_coherence_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coherence --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_gpt  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/writingPrompts-event2story_bpe500/gpt2model.pytorch
# Run 5: memory-gate + coherence combined.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gate_coherence_gpt --arch transformer_lm_bpe_memory_gate_coherence_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_coherence --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gate_coherence_gpt  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint data-bin/writingPrompts-event2story_bpe500/gpt2model.pytorch

# Memory-gate + relevance arch with --use-context True, GPT-2 initialised.
CUDA_VISIBLE_DEVICES=5,6,7 fairseq-train --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gate_relevance_gpt --arch transformer_lm_bpe_memory_gate_relevance_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_relevance --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gate_relevance_gpt  \
  --log-interval  1000 --log-format simple --distributed-world-size 3 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos  --use-context True \
  --pretrained  --pretrained-checkpoint data-bin/writingPrompts-event2story_bpe500/gpt2model.pytorch
# "_fix" rerun.  NOTE(review): the pretrained checkpoint path points inside
# this run's own save-dir (checkpoint_best_lm_ga.pt) — presumably resuming
# from a previously copied-in checkpoint; confirm the file exists before
# launching.
CUDA_VISIBLE_DEVICES=5,6,7 fairseq-train --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gate_relevance_gpt_fix --arch transformer_lm_bpe_memory_gate_relevance_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_relevance --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gate_relevance_gpt_fix  \
  --log-interval  1000 --log-format simple --distributed-world-size 3 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos  --use-context True \
  --pretrained  --pretrained-checkpoint ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gate_relevance_gpt_fix/checkpoint_best_lm_ga.pt

CUDA_VISIBLE_DEVICES=5,6,7 fairseq-train --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gate_relevance_gpt_fix_mlp --arch transformer_lm_bpe_memory_gate_relevance_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_cross_entropy_without_event_relevance --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gate_relevance_gpt_fix_mlp  \
  --log-interval  1000 --log-format simple --distributed-world-size 3 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos  --use-context True \
  --pretrained  --pretrained-checkpoint ../out/transformer-lm_bpe500_warmlr_format_pre2_memory_gate_relevance_gpt_fix/checkpoint_best_lm_ga.pt

# Coherence discriminator training: classifiers initialized from the pretrained
# story LM checkpoint (story_coherence / story_coherence_enhance criteria).
# Base discriminator (2 GPUs).
CUDA_VISIBLE_DEVICES=6,7 fairseq-train --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis --arch transformer_lm_bpe_coherence_dis_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_coherence --weight-decay .0001  \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis  \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint ../fire_data/running/transformer-lm_bpe500_warmlr_format_pre2/checkpoint_best.pt

# "Enhanced" discriminator variant (dise arch, story_coherence_enhance
# criterion, 4 GPUs); same pretrained LM initialization.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_enhance --arch transformer_lm_bpe_coherence_dise_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_coherence_enhance --weight-decay .0001  \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_enhance  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint ../fire_data/running/transformer-lm_bpe500_warmlr_format_pre2/checkpoint_best.pt
# "_no4" rerun of the enhanced discriminator on 2 GPUs.
# NOTE(review): invoked via a local train.py instead of the fairseq-train entry
# point — presumably a patched copy of fairseq's trainer; confirm it is
# equivalent before relying on it.
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_enhance_no4 --arch transformer_lm_bpe_coherence_dise_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_coherence_enhance --weight-decay .0001  \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_enhance_no4  \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint ../fire_data/running/transformer-lm_bpe500_warmlr_format_pre2/checkpoint_best.pt
# "_no43" rerun on 4 GPUs via the standard fairseq-train entry point.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_enhance_no43 --arch transformer_lm_bpe_coherence_dise_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_coherence_enhance --weight-decay .0001  \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_enhance_no43  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint ../fire_data/running/transformer-lm_bpe500_warmlr_format_pre2/checkpoint_best.pt
# "_fix" variant: disef arch + story_coherence_enhance_fix criterion; note it
# pretrains from ../out (not ../fire_data) — the two paths are presumably
# copies of the same LM checkpoint; verify.
CUDA_VISIBLE_DEVICES=0,1 python train.py --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_enhance_fix --arch transformer_lm_bpe_coherence_disef_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_coherence_enhance_fix --weight-decay .0001  \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_enhance_fix  \
  --log-interval  1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint ../out/transformer-lm_bpe500_warmlr_format_pre2/checkpoint_best.pt
# Article-level discriminator (disa arch, story_coherence_article criterion).
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_article --arch transformer_lm_bpe_coherence_disa_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_coherence_article --weight-decay .0001  \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_article  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint ../fire_data/running/transformer-lm_bpe500_warmlr_format_pre2/checkpoint_best.pt
# "_pre" rerun of the article discriminator — identical hyperparameters, only
# the save/log directories differ.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --user-dir coherence_story --task language_modeling_bpe data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_article_pre --arch transformer_lm_bpe_coherence_disa_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_coherence_article --weight-decay .0001  \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_article_pre  \
  --log-interval  1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint ../fire_data/running/transformer-lm_bpe500_warmlr_format_pre2/checkpoint_best.pt

# RL fine-tuning of the story LM with the coherence discriminator as reward
# (train_dis.py, language_modeling_bpe_rl task, story_rl criterion).
# --restore-file resolves relative to --save-dir; checkpoint_best_lm_story.pt
# is presumably copied into each save-dir beforehand — verify before launching.
CUDA_VISIBLE_DEVICES=6,7 python train_dis.py --user-dir coherence_story --task language_modeling_bpe_rl data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_rl --arch fairseqrl_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_rl --weight-decay .0001  \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_rl  \
  --log-interval  100 --save-interval-updates 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint-dis ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis/checkpoint_best.pt \
  --restore-file checkpoint_best_lm_story.pt

# Same RL run with mixed-weight 0.9 (MLE/RL loss mixture sweep).
CUDA_VISIBLE_DEVICES=4,5 python train_dis.py --user-dir coherence_story --task language_modeling_bpe_rl data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_rl_09 --arch fairseqrl_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_rl --weight-decay .0001  --mixed-weight 0.9 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_rl_09  \
  --log-interval  100 --save-interval-updates 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint-dis ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis/checkpoint_best.pt \
  --restore-file checkpoint_best_lm_story.pt
# Same RL run with mixed-weight 0.5.
CUDA_VISIBLE_DEVICES=0,3 python train_dis.py --user-dir coherence_story --task language_modeling_bpe_rl data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_rl_05 --arch fairseqrl_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_rl --weight-decay .0001  --mixed-weight 0.5 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_rl_05  \
  --log-interval  100 --save-interval-updates 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint-dis ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis/checkpoint_best.pt \
  --restore-file checkpoint_best_lm_story.pt

# RL run rewarded by the "enhanced" discriminator, mixed-weight 0.5.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train_dis.py --user-dir coherence_story --task language_modeling_bpe_rl data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_rl_en_05 --arch fairseqrl_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_rl --weight-decay .0001  --mixed-weight 0.5 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_rl_en_05  \
  --log-interval  100 --save-interval-updates 1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint-dis ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_enhance/checkpoint_best.pt \
  --restore-file checkpoint_best_lm_story.pt
# mixed-weight 0.9 variant.
# NOTE(review): --pretrained-checkpoint-dis points into this run's OWN save-dir
# (checkpoint_best_dis.pt) — looks like a restart of an earlier launch; confirm
# the file exists.
CUDA_VISIBLE_DEVICES=0,1,2,3 python train_dis.py --user-dir coherence_story --task language_modeling_bpe_rl data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_rl_en_09 --arch fairseqrl_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_rl --weight-decay .0001  --mixed-weight 0.9 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_rl_en_09  \
  --log-interval  100 --save-interval-updates 1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint-dis ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_rl_en_09/checkpoint_best_dis.pt \
  --restore-file checkpoint_best_lm_story.pt

# RL run rewarded by the article-level discriminator, mixed-weight 0.5.
CUDA_VISIBLE_DEVICES=4,5,6,7 python train_dis.py --user-dir coherence_story --task language_modeling_bpe_rl data-bin/writingPrompts-event2story_bpe500 \
  --save-dir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_rl_ar_05 --arch fairseqrl_gpt_format \
  --max-tokens 4000 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_rl --weight-decay .0001  --mixed-weight 0.5 \
  --tensorboard-logdir ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_rl_ar_05  \
  --log-interval  100 --save-interval-updates 1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test --tokens-per-sample 1024 --sample-break-mode eos \
  --pretrained  --pretrained-checkpoint-dis ../out/transformer-lm_bpe500_warmlr_format_pre2_coherence_dis_article_pre/checkpoint_best.pt \
  --restore-file checkpoint_best_lm_story.pt

# Adversarial (GAN-style) training: generator initialized from the bilstm
# hierarchical LM checkpoint, discriminator from the pretrained story LM
# (story_adv criterion, hierstory_bpe_gan task, train_gan.py driver).
CUDA_VISIBLE_DEVICES=0,1,2,3 python train_gan.py --user-dir coherence_story --task hierstory_bpe_gan data-bin/writingPrompts-srl2story_bpe500_fix \
  --save-dir ../out/bilstm_translm_coherence_gan --arch fairseqgan_gpt_format \
  --source-lang event --target-lang new  --max-tokens 1024 \
  --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
  --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
  --criterion story_adv --weight-decay .0001 --label-smoothing 0 \
  --tensorboard-logdir ../out/bilstm_translm_coherence_gan \
  --log-interval 100 --save-interval-updates 2000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
  --skip-invalid-size-inputs-valid-test \
  --pretrained  --pretrained-checkpoint ../out/bilstm_transformer-lm_bpe500_format_pre_srl_hier_pos_fix/checkpoint_best.pt \
  --pretrained-checkpoint-dis ../fire_data/running/transformer-lm_bpe500_warmlr_format_pre2/checkpoint_best.pt

# prompt to event model
#mkdir ../out/prompt2event
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2event \
#    --save-dir ../out/prompt2event --arch lstm \
#    --source-lang prompt --target-lang event  --max-tokens 1500 \
#    --lr 0.25 --clip-norm 0.1 --max-target-positions 1500 --lr-scheduler reduce_lr_on_plateau \
#    --criterion label_smoothed_cross_entropy --weight-decay .0000001 --label-smoothing 0 \
#    --tensorboard-logdir ../out/prompt2event \
#    --log-interval  1000 --log-format simple --distributed-world-size 2
#mkdir ../out/prompt2event_500
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2event_500 \
#    --save-dir ../out/prompt2event_500 --arch transformer_prompt_to_event \
#    --source-lang prompt --target-lang event --max-tokens 4000 \
#    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0.1 \
#    --tensorboard-logdir ../out/prompt2event_500 \
#    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.3
#CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
#    --save-dir ../out/prompt2srl2_500 --arch transformer_prompt_to_event \
#    --source-lang prompt --target-lang event --max-tokens 4000 \
#    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0.1 \
#    --tensorboard-logdir ../out/prompt2srl2_500 \
#    --log-interval 1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
#    --skip-invalid-size-inputs-valid-test
# Fine-tune the prompt->event model with the cross_entropy_fa criterion,
# resetting optimizer/scheduler/meters so only the weights carry over.
# NOTE(review): --restore-file resolves relative to --save-dir
# (prompt2srl2_500_fine); checkpoint_best.pt must be copied there from the base
# run first — confirm.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_fine --arch transformer_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion cross_entropy_fa --weight-decay .0001 \
    --tensorboard-logdir ../out/prompt2srl2_500_fine \
    --log-interval 1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test \
    --restore-file checkpoint_best.pt \
    --reset-optimizer --reset-lr-scheduler --reset-meters
# Prompt->event transformer without label smoothing, dropout 0.1.
CUDA_VISIBLE_DEVICES=6,7 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1 --arch transformer_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test
# Convolutional seq2seq baseline (fconv_self_att_wp) on the same data; uses the
# fconv LR schedule (SGD-style lr 0.25 + reduce_lr_on_plateau) instead of Adam.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_conv --arch fconv_self_att_wp \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --lr 0.25 --clip-norm 0.1 --lr-scheduler reduce_lr_on_plateau \
    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0 \
    --decoder-attention True --encoder-attention False --pretrained False \
    --gated-attention True --self-attention True --project-input True \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_conv \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test
# "_fix" repeat of the cross_entropy_fa fine-tune on 2 GPUs; restore-file again
# resolves inside the fresh save-dir — presumably a copied checkpoint; verify.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_fine_fix --arch transformer_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion cross_entropy_fa --weight-decay .0001 \
    --tensorboard-logdir ../out/prompt2srl2_500_fine_fix \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test \
    --restore-file checkpoint_best.pt \
    --reset-optimizer --reset-lr-scheduler --reset-meters
# "_fix2" — identical hyperparameters to "_fix", only directories differ.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_fine_fix2 --arch transformer_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion cross_entropy_fa --weight-decay .0001 \
    --tensorboard-logdir ../out/prompt2srl2_500_fine_fix2 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test \
    --restore-file checkpoint_best.pt \
    --reset-optimizer --reset-lr-scheduler --reset-meters
# Verb-attention variant of the prompt->event transformer
# (--use-verb-attention is consumed by the custom arch).
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb --arch transformer_verb_prompt_to_event --use-verb-attention \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test
# "Big" (GPT-2-sized) variant, launched via local train.py (presumably a
# patched trainer — verify it matches fairseq-train behavior).
CUDA_VISIBLE_DEVICES=0,1 python train.py --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_gpt2 --arch transformer_prompt_to_event_big \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_gpt2 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test
# "_fix2_reset" — another cross_entropy_fa fine-tune with full optimizer/
# scheduler/meter reset; restore-file must exist in the new save-dir (verify).
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_fine_fix2_reset --arch transformer_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion cross_entropy_fa --weight-decay .0001 \
    --tensorboard-logdir ../out/prompt2srl2_500_fine_fix2_reset \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test \
    --restore-file checkpoint_best.pt \
    --reset-optimizer --reset-lr-scheduler --reset-meters
# Big model on the "nosympol" dataset (target side with symbols stripped —
# note the different data-bin and target-lang).
CUDA_VISIBLE_DEVICES=2,3 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500_nosympol \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_gpt2_nosympol --arch transformer_prompt_to_event_big \
    --source-lang prompt --target-lang eventnosympol --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_gpt2_nosympol \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test
# Big model WITH label smoothing 0.1 and dropout 0.3 (contrast with the
# wo_smooth_d1 runs above).
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_gpt2 --arch transformer_prompt_to_event_big \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0.1 \
    --tensorboard-logdir ../out/prompt2srl2_500_gpt2 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test
# Same smoothed big model on the "nosympol" dataset.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500_nosympol \
    --save-dir ../out/prompt2srl2_500_gpt2_nosympol --arch transformer_prompt_to_event_big \
    --source-lang prompt --target-lang eventnosympol --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0.1 \
    --tensorboard-logdir ../out/prompt2srl2_500_gpt2_nosympol \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
    --skip-invalid-size-inputs-valid-test
# Verb-loss architecture sweep, variant 1.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss --arch transformer_verb_loss_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test
# Verb-loss variant 2 — only the arch (and directories) differ.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss2 --arch transformer_verb_loss2_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss2 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test
# Verb-loss variants 3/4/32 with the event_verb_cross_entropy criterion, each
# warm-started from a pretrained checkpoint.
# NOTE(review): ../out/checkpoint_best.pt is an unqualified path (no run
# subdirectory) — confirm which run's checkpoint it actually is.
CUDA_VISIBLE_DEVICES=2,3 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss3 --arch transformer_verb_loss3_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion event_verb_cross_entropy --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss3 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test \
    --pretrained  --pretrained-checkpoint ../out/checkpoint_best.pt
# Variant 4.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss4 --arch transformer_verb_loss4_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion event_verb_cross_entropy --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss4 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test \
    --pretrained  --pretrained-checkpoint ../out/checkpoint_best.pt
# Variant 32.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss32 --arch transformer_verb_loss32_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion event_verb_cross_entropy --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss32 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test \
    --pretrained  --pretrained-checkpoint ../out/checkpoint_best.pt
# Variant-3 event-weight sweep: 0.2 and 0.4 mixtures of the event loss.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss3e2 --arch transformer_verb_loss3_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion event_verb_cross_entropy --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss3e2 --event-weight 0.2 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test \
    --pretrained  --pretrained-checkpoint ../out/checkpoint_best.pt
# event-weight 0.4.
CUDA_VISIBLE_DEVICES=2,3 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss3e4 --arch transformer_verb_loss3_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion event_verb_cross_entropy --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss3e4 --event-weight 0.4 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test \
    --pretrained  --pretrained-checkpoint ../out/checkpoint_best.pt
# Verb-loss 322 arch with the event_verb_cross_entropy2 criterion, trained from
# scratch (no --pretrained here), event-weight 0.2.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss322 --arch transformer_verb_loss322_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion event_verb_cross_entropy2 --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss322  --event-weight 0.2 \
    --log-interval 1000 --log-format simple --distributed-world-size 4 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test
# Same arch, event-weight 0.4.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss3224 --arch transformer_verb_loss322_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion event_verb_cross_entropy2 --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss3224 --event-weight 0.4 \
    --log-interval 1000 --log-format simple --distributed-world-size 4 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test
# Verb-loss variant 12: original verb_loss arch with the
# event_verb_cross_entropy3 criterion.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss12 --arch transformer_verb_loss_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion event_verb_cross_entropy3 --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss12 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test
# Verb-loss 323 arch + event_verb_cross_entropy2, event-weight 0.2, 4 GPUs.
CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss323 --arch transformer_verb_loss323_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion event_verb_cross_entropy2 --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss323  --event-weight 0.2 \
    --log-interval 1000 --log-format simple --distributed-world-size 4 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test
# "_first" rerun of the 323 variant on 2 GPUs (same hyperparameters otherwise).
CUDA_VISIBLE_DEVICES=2,3 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss323_first --arch transformer_verb_loss323_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion event_verb_cross_entropy2 --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss323_first  --event-weight 0.2 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test
# Verb-loss 13 arch with event_verb_cross_entropy3.
CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2srl2_500 \
    --save-dir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss13 --arch transformer_verb_loss13_prompt_to_event \
    --source-lang prompt --target-lang event --max-tokens 4000 \
    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
    --criterion event_verb_cross_entropy3 --weight-decay .0001 --label-smoothing 0 \
    --tensorboard-logdir ../out/prompt2srl2_500_wo_smooth_d1_verb_loss13 \
    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.1 \
    --skip-invalid-size-inputs-valid-test
## prompt to story model
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation_bpe data-bin/writingPrompts-prompt2story_bpe500 \
#    --save-dir ../out/prompt2story_bpe500 --arch transformer_prompt_to_event_bpe \
#    --source-lang prompt --target-lang new --max-tokens 4000 \
#    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0.1 \
#    --tensorboard-logdir ../out/prompt2story_bpe500 \
#    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#    --skip-invalid-size-inputs-valid-test
#CUDA_VISIBLE_DEVICES=0,1 fairseq-train --task translation data-bin/writingPrompts-prompt2story_500 \
#    --save-dir ../out/prompt2story_500 --arch transformer_prompt_to_event \
#    --source-lang prompt --target-lang new --max-tokens 4000 \
#    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0.1 \
#    --tensorboard-logdir ../out/prompt2story_500 \
#    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#    --skip-invalid-size-inputs-valid-test

# prompt to verb model
#CUDA_VISIBLE_DEVICES=2,3 fairseq-train --task translation data-bin/writingPrompts-prompt2verb_500 \
#    --save-dir ../out/prompt2verb_500 --arch transformer_prompt_to_event \
#    --source-lang prompt --target-lang verb --max-tokens 4000 \
#    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0.1 \
#    --tensorboard-logdir ../out/prompt2verb_500 \
#    --log-interval 1000 --log-format simple --distributed-world-size 2 --dropout 0.3 \
#    --skip-invalid-size-inputs-valid-test
#CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --task translation data-bin/writingPrompts-prompt2mainverb_500 \
#    --save-dir ../out/prompt2mainverb_500 --arch transformer_prompt_to_event \
#    --source-lang prompt --target-lang mainverb --max-tokens 4000 \
#    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0.1 \
#    --tensorboard-logdir ../out/prompt2mainverb_500 \
#    --log-interval 1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
#    --skip-invalid-size-inputs-valid-test
#CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --task translation data-bin/writingPrompts-prompt2srl_500 \
#    --save-dir ../out/prompt2srl_500 --arch transformer_prompt_to_event \
#    --source-lang prompt --target-lang event --max-tokens 4000 \
#    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0.1 \
#    --tensorboard-logdir ../out/prompt2srl_500 \
#    --log-interval 1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
#    --skip-invalid-size-inputs-valid-test
#CUDA_VISIBLE_DEVICES=0,1,2,3 fairseq-train --task translation data-bin/writingPrompts-prompt2srl_500 \
#    --save-dir ../out/prompt2srl_500_big --arch transformer_prompt_to_event_big \
#    --source-lang prompt --target-lang event --max-tokens 4000 \
#    --optimizer adam --adam-betas '(0.9, 0.98)'  --lr 0.0005  --lr-scheduler inverse_sqrt \
#    --warmup-updates 4000 --warmup-init-lr '1e-07' --min-lr '1e-09' \
#    --criterion label_smoothed_cross_entropy --weight-decay .0001 --label-smoothing 0.1 \
#    --tensorboard-logdir ../out/prompt2srl_500_big \
#    --log-interval 1000 --log-format simple --distributed-world-size 4 --dropout 0.3 \
#    --skip-invalid-size-inputs-valid-test